Setup
#load libraries
library(ggplot2)
library(pheatmap)
library(DESeq2)
library(ggbiplot)
library(matrixStats)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(pathview)
library(gage)
library(gageData)
library("survival")
library("survminer")
#Import and preprocess data
rna_seq = read.csv("RNAseq.csv", row.names=1)
clinical_patient = read.delim("data_clinical_patient.txt")
mutations = read.delim("data_mutations.txt")
Data Processing
#Preprocess data
#Filter data where there is only 0 or a read count across all samples
rna_seq <- rna_seq[rowSums(rna_seq)>1,]
RNAseq <- rna_seq
#For some reason, there's some RNA_seq samples not associated with any patient data. we remove these:
RNAseq = RNAseq[,which(substr(colnames(RNAseq),1,12) %in%
gsub("-",".",clinical_patient$X.Patient.Identifier[-c(1,2,3,4)]))]
Data Exploration
#Looking at possible factors that may have differences in gene expression
colnames(clinical_patient)
## [1] "X.Patient.Identifier"
## [2] "Subtype"
## [3] "TCGA.PanCanAtlas.Cancer.Type.Acronym"
## [4] "Other.Patient.ID"
## [5] "Diagnosis.Age"
## [6] "Sex"
## [7] "Neoplasm.Disease.Stage.American.Joint.Committee.on.Cancer.Code"
## [8] "American.Joint.Committee.on.Cancer.Publication.Version.Type"
## [9] "Last.Communication.Contact.from.Initial.Pathologic.Diagnosis.Date"
## [10] "Birth.from.Initial.Pathologic.Diagnosis.Date"
## [11] "Last.Alive.Less.Initial.Pathologic.Diagnosis.Date.Calculated.Day.Value"
## [12] "Ethnicity.Category"
## [13] "Form.completion.date"
## [14] "Neoadjuvant.Therapy.Type.Administered.Prior.To.Resection.Text"
## [15] "ICD.10.Classification"
## [16] "International.Classification.of.Diseases.for.Oncology..Third.Edition.ICD.O.3.Histology.Code"
## [17] "International.Classification.of.Diseases.for.Oncology..Third.Edition.ICD.O.3.Site.Code"
## [18] "Informed.consent.verified"
## [19] "New.Neoplasm.Event.Post.Initial.Therapy.Indicator"
## [20] "American.Joint.Committee.on.Cancer.Metastasis.Stage.Code"
## [21] "Neoplasm.Disease.Lymph.Node.Stage.American.Joint.Committee.on.Cancer.Code"
## [22] "American.Joint.Committee.on.Cancer.Tumor.Stage.Code"
## [23] "Person.Neoplasm.Cancer.Status"
## [24] "Primary.Lymph.Node.Presentation.Assessment"
## [25] "Prior.Diagnosis"
## [26] "Race.Category"
## [27] "Radiation.Therapy"
## [28] "Patient.Weight"
## [29] "In.PanCan.Pathway.Analysis"
## [30] "Overall.Survival.Status"
## [31] "Overall.Survival..Months."
## [32] "Disease.specific.Survival.status"
## [33] "Months.of.disease.specific.survival"
## [34] "Disease.Free.Status"
## [35] "Disease.Free..Months."
## [36] "Progression.Free.Status"
## [37] "Progress.Free.Survival..Months."
#Looking at sex
table(clinical_patient$Sex)
##
## 1 Female Male Sex SEX STRING
## 1 121 251 1 1 1
#looking at tumor stage
table(clinical_patient$Neoplasm.Disease.Stage.American.Joint.Committee.on.Cancer.Code)
##
##
## 21
## 1
## 1
## AJCC_PATHOLOGIC_TUMOR_STAGE
## 1
## STAGE I
## 173
## STAGE II
## 87
## STAGE III
## 3
## STAGE IIIA
## 65
## STAGE IIIB
## 9
## STAGE IIIC
## 8
## STAGE IV
## 3
## STAGE IVA
## 1
## STAGE IVB
## 2
## STRING
## 1
## The extent of a cancer, especially whether the disease has spread from the original site to other parts of the body based on AJCC staging criteria.
## 1
#looking at tumor vs tumor free
table(clinical_patient$Person.Neoplasm.Cancer.Status)
##
## 1
## 27 1
## Person neoplasm cancer status. PERSON_NEOPLASM_CANCER_STATUS
## 1 1
## STRING Tumor Free
## 1 232
## With Tumor
## 113
Analysis on patient features such as sex
countData <- rna_seq
#create conditions
temp = data.frame(sex = clinical_patient$Sex[-c(1,2,3,4)],
identity = gsub("-",".",clinical_patient$X.Patient.Identifier[-c(1,2,3,4)]))
category = vector()
knowns = vector()
for(i in colnames(countData)){
for(j in temp$identity)
if(grepl(j,i,fixed = TRUE)){
category = c(category, temp$sex[temp$identity==j])
knowns = c(knowns,i)
break
}
}
colData = data.frame(sex = category)
rownames(colData)= knowns
countData = countData[,knowns]
#Running the differential expression pipeline
dds = DESeqDataSetFromMatrix(countData=round(countData),
colData=colData,
design=~sex)
dds = DESeq(dds)
#Building the results table
res <- results(dds)
res = results(dds, contrast=c("sex", "Male", "Female"))
mcols(res, use.names = TRUE)
## DataFrame with 6 rows and 2 columns
## type description
## <character> <character>
## baseMean intermediate mean of normalized c..
## log2FoldChange results log2 fold change (ML..
## lfcSE results standard error: sex ..
## stat results Wald statistic: sex ..
## pvalue results Wald test p-value: s..
## padj results BH adjusted p-values
summary(res)
##
## out of 55071 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 3475, 6.3%
## LFC < 0 (down) : 4911, 8.9%
## outliers [1] : 0, 0%
## low counts [2] : 20292, 37%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Variance stabilizing transformation
vsd <- vst(dds)
#PCA
plotPCA(vsd,intgroup=c("sex"))
#Filter by padj value and then log2FoldChange to find significant differentially expressed genes
resSig <- subset(res, padj < 0.05)
resSig_1 <- subset(resSig, log2FoldChange < -1 | log2FoldChange > 1)
genes <- order(resSig_1$padj,decreasing = TRUE)
#patient-gene heatmap
annot_col = data.frame(colData$sex)
row.names(annot_col) <- rownames(colData)
sampleMatrix <- assay(vsd)[genes,]
rownames(sampleMatrix) = rownames(countData[genes,])
colnames(sampleMatrix) = colnames(countData)
pheatmap(sampleMatrix , cluster_rows=FALSE, show_rownames=FALSE,
show_colnames=FALSE,clustering_method = "ward.D",
cluster_cols=TRUE, annotation_col=annot_col)
Analysis on patient features such as tumor stages
countData <- rna_seq
#create conditions
temp = data.frame(stage = clinical_patient$Neoplasm.Disease.Stage.American.Joint.Committee.on.Cancer.Code[-c(1,2,3,4)],
identity = gsub("-",".",clinical_patient$X.Patient.Identifier[-c(1,2,3,4)]))
category = vector()
knowns = vector()
for(i in colnames(countData)){
for(j in temp$identity)
if(grepl(j,i,fixed = TRUE)){
category = c(category, temp$stage[temp$identity==j])
knowns = c(knowns,i)
break
}
}
colData = data.frame(stage = category)
#Grouping Data
colData[colData == "STAGE IIIA"] <- "STAGE III"
colData[colData == "STAGE IIIB"] <- "STAGE III"
colData[colData == "STAGE IIIC"] <- "STAGE III"
colData[colData == "STAGE IVA"] <- "STAGE IV"
colData[colData == "STAGE IVB"] <- "STAGE IV"
rownames(colData)= knowns
countData = countData[,knowns]
#Running the differential expression pipeline
dds = DESeqDataSetFromMatrix(countData=round(countData),
colData=colData,
design=~stage)
dds = DESeq(dds)
#Building the results table
res <- results(dds)
mcols(res, use.names = TRUE)
## DataFrame with 6 rows and 2 columns
## type description
## <character> <character>
## baseMean intermediate mean of normalized c..
## log2FoldChange results log2 fold change (ML..
## lfcSE results standard error: stag..
## stat results Wald statistic: stag..
## pvalue results Wald test p-value: s..
## padj results BH adjusted p-values
summary(res)
##
## out of 55071 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 41, 0.074%
## LFC < 0 (down) : 20, 0.036%
## outliers [1] : 0, 0%
## low counts [2] : 28834, 52%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#Variance stabilizing transformation
vsd <- vst(dds)
#PCA
plotPCA(vsd,intgroup=c("stage"))
#Filter by padj value and then log2FoldChange to find significant differentially expressed genes
resSig <- subset(res, padj < 0.05)
resSig_1 <- subset(resSig, log2FoldChange < -1 | log2FoldChange > 1)
genes <- order(resSig_1$padj,decreasing = TRUE)
#patient-gene heatmap
annot_col = data.frame(colData$stage)
row.names(annot_col) <- rownames(colData)
sampleMatrix <- assay(vsd)[genes,]
rownames(sampleMatrix) = rownames(countData[genes,])
colnames(sampleMatrix) = colnames(countData)
pheatmap(sampleMatrix , cluster_rows=FALSE, show_rownames=FALSE,
show_colnames=FALSE,clustering_method = "ward.D",
cluster_cols=TRUE, annotation_col=annot_col)
Expression data analysis
#get variance of each expression:
Expression_variance = data.frame(Expressions = rownames(RNAseq))
Expression_variance$variance = rowSds(as.matrix(RNAseq))
Expression_variance = Expression_variance[order(Expression_variance$variance, decreasing=TRUE),]
#Take the top 500 expressions in terms of variance, then do PCA analysis:
Top_500 = Expression_variance$Expressions[1:500]
Culled_RNA = scale(t(RNAseq[Top_500,]))
C.RNA.pca = prcomp(Culled_RNA, center = TRUE, scale. = TRUE)
#Choose the PC's that add up to 90% of the variance:
totvar = 0
PCcount = 0 #This is the number of PCs to use in clustering
while (totvar < 0.9){
PCcount = PCcount + 1
totvar = totvar + summary(C.RNA.pca)$importance[2,PCcount]
}
#Create clusters based on that PC's that show 90% of variance
distmat = dist(C.RNA.pca$x[,1:PCcount], method = 'euclidean')
hclust_ward = hclust(distmat, method = 'ward.D')
plot(hclust_ward, labels = FALSE)
rect.hclust(hclust_ward, k = 4, border = 2:6)
fourclust = cutree(hclust_ward, k = 4)
colData1 = data.frame(cluster = fourclust)
colData1$cluster <- sapply(colData1$cluster, as.character)
dds = DESeqDataSetFromMatrix(countData = RNAseq,
colData=colData1,
design=~cluster)
dds = DESeq(dds)
res = results(dds)
#Get only the significant and differentially expressed genes
res_sig = subset(res, padj < 0.05)
res_sig = subset(res_sig, log2FoldChange < -1 | log2FoldChange > 1)
res_sig = res_sig[order(res_sig$log2FoldChange, decreasing = TRUE), ]
#Variance stabilizing transformation
RNAseq2.0 = RNAseq[rownames(res_sig),]
vsd1 <- vst(dds)
#Patient-gene heatmap
annot_col = data.frame(colData1$cluster)
row.names(annot_col) <- rownames(colData1)
sampleMatrix <- assay(vsd1)[rownames(res_sig),]
rownames(sampleMatrix) = rownames(RNAseq2.0)
colnames(sampleMatrix) = colnames(RNAseq2.0)
pheatmap(sampleMatrix , cluster_rows=FALSE, show_rownames=TRUE, show_colnames = FALSE,
cluster_cols=TRUE, clustering_method = "ward.D", annotation_col=annot_col)
#PCA plot
pcaData <- plotPCA(vsd1, intgroup=c("cluster"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=cluster)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()
#Pathway analysis on the patient cluster DE genes
tmp=gsub("\\..*","",row.names(res))
res$entrez = mapIds(org.Hs.eg.db,
keys=tmp,
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
data(kegg.sets.hs)
data(sigmet.idx.hs)
# Focus on signaling and metabolic pathways only
kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs]
#Look at all significant genes/pathways - padj < 0.06
#We were getting trouble with GAGE when trying to get pathways with padj < 0.05 so we used 0.06 instead
res = subset(res, padj < 0.06)
foldchanges = res$log2FoldChange
names(foldchanges) = res$entrez
foldchanges <- foldchanges[!is.na(names(foldchanges))]
head(foldchanges)
## 64102 2268 2519 2729 4800 6405
## -1.2030675 0.6069746 0.2269985 -0.4761025 0.2182152 -0.2633002
keggres = gage(foldchanges, gsets=kegg.sets.hs)
## Focus on top upregulated and downregulated pathways
keggrespathwaysup <- rownames(keggres$greater)[1] #change to [1:5] to look at top 5 pathways
keggrespathwaysdown <- rownames(keggres$less)[1]
# Extract the 8 character long IDs part of each string
keggresidsup = substr(keggrespathwaysup, start=1, stop=8)
keggresidsdown = substr(keggrespathwaysdown, start=1, stop=8)
pathview(gene.data=foldchanges, pathway.id=keggresidsup, species="hsa")
pathview(gene.data=foldchanges, pathway.id=keggresidsdown, species="hsa")
#Survival analysis on the patient clusters
clin_df = clinical_patient[,
c("X.Patient.Identifier",
"Overall.Survival.Status",
"Overall.Survival..Months.")]
clin_df = clin_df[-c(1,2,3,4),]
Overall_survival = c()
survival_months = c()
#Possible affected groups
age = c()
race = c()
sex = c()
stage = c()
for (i in rownames(colData1)){
index = which(gsub("-",".",clin_df$X.Patient.Identifier) == substr(i,1,12))
Overall_survival = c(Overall_survival, clin_df$Overall.Survival.Status[index])
survival_months = c(survival_months, clin_df$Overall.Survival..Months.[index])
sex = c(sex, clinical_patient$Sex[index+4])
race = c(race, clinical_patient$Race.Category[index+4])
age = c(age, clinical_patient$Diagnosis.Age[index+4])
stage = c(stage, clinical_patient$Neoplasm.Disease.Stage.American.Joint.Committee.on.Cancer.Code[index+4])
}
colData1$Overall.Survival.Status = Overall_survival
colData1$Overall.Survival..Months. = survival_months
colData1$sex = sex
colData1$age = age
colData1$race = race
colData1$stage = stage
colData1$deceased = colData1$Overall.Survival.Status == "1:DECEASED"
colData1$Overall.Survival..Months. <- as.numeric(colData1$Overall.Survival..Months.)
fit = survfit(Surv(Overall.Survival..Months., deceased) ~ cluster, data=colData1)
print(fit)
## Call: survfit(formula = Surv(Overall.Survival..Months., deceased) ~
## cluster, data = colData1)
##
## 1 observation deleted due to missingness
## n events median 0.95LCL 0.95UCL
## cluster=1 175 56 70.1 45.1 NA
## cluster=2 99 48 30.6 19.8 60.9
## cluster=3 52 35 37.3 25.3 55.7
## cluster=4 92 24 83.2 55.4 NA
ggsurvplot(fit, data=colData1, pval=T, risk.table=T, risk.table.height=0.35)
We wanted to look at potential clinical factors that affected cluster 3
table(colData1$sex[which(colData1$cluster==3)])
##
## Female Male
## 23 29
table(colData1$age[which(colData1$cluster==3)])
##
## 20 23 29 37 40 42 43 45 46 50 51 52 53 54 55 57 61 62 64 65 66 67 68 69 70 71
## 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 2 2 1 2 2 1
## 72 73 74 75 76 77 78 81
## 3 1 1 4 5 1 1 2
table(colData1$race[which(colData1$cluster==3)])
##
## Asian Black or African American
## 3 6 6
## White
## 37
table(colData1$stage[which(colData1$cluster==3)])
##
## STAGE I STAGE II STAGE III STAGE IIIA STAGE IIIC STAGE IV
## 9 18 12 3 8 1 1
#None seems promising so we wanted to look at mutation data
Creating confusion matrix based of presence of top 5 mutated and expressed genes in cluster 3
mutationgroup = rownames(colData1)[which(colData1$cluster==3)]
mutations$cluster = ifelse(substr(gsub("-",".",mutations$Tumor_Sample_Barcode),1,12)
%in% substr(mutationgroup,1,12), "3","0")
test = table(mutations$Hugo_Symbol[which(mutations$cluster == "3")])
mostfrequent = order(test, decreasing = TRUE)[1:10]
#ANKHD1 is one of the top 50 mutated gene and also one of the top 5 expressed gene in cluster 3
hasmutation = substr(gsub("-",".",mutations$Tumor_Sample_Barcode[which(mutations$Hugo_Symbol == "ANKHD1")]),1,12)
prediction = ifelse(colData1$cluster == 3,TRUE,FALSE)
real = ifelse(substr(rownames(colData1),1,12) %in% hasmutation, TRUE, FALSE)
table(prediction, real)
## real
## prediction FALSE TRUE
## FALSE 355 12
## TRUE 47 5
#ANKHD1, a top mutated gene, has an accuracy of ~85% which shows that it is prevalent within cluster 3
Survival analysis on patient with mutation in ANKHD1 gene
#Only looking at patient with mutation in specific gene - ANKHD1 - found in cluster 3
groupA <- substr(mutations$Tumor_Sample_Barcode[grep("ANKHD1",mutations$Hugo_Symbol)],1,12)
clinical_patient$group <- ifelse(clinical_patient$X.Patient.Identifier %in% groupA,
"groupA",
ifelse(!(clinical_patient$X.Patient.Identifier %in% groupA), "groupB", NA))
# we are only interested in the "With Tumor" cases for survival
clin_df = clinical_patient[clinical_patient$Person.Neoplasm.Cancer.Status == "With Tumor",
c("X.Patient.Identifier",
"Overall.Survival.Status",
"Overall.Survival..Months.",
"group")]
# create a new boolean variable that has TRUE for dead patients
# and FALSE for live patients
clin_df$deceased = clin_df$Overall.Survival.Status == "1:DECEASED"
# Overall survival months takes into account months_to_death
#for dead patients, and to months_to_last_follow_up for patients who
# are still alive
clin_df$Overall.Survival..Months. <- as.numeric(clin_df$Overall.Survival..Months.)
# fit a survival model
fit = survfit(Surv(Overall.Survival..Months., deceased) ~ group, data=clin_df)
print(fit)
## Call: survfit(formula = Surv(Overall.Survival..Months., deceased) ~
## group, data = clin_df)
##
## 1 observation deleted due to missingness
## n events median 0.95LCL 0.95UCL
## group=groupA 3 2 18.3 12.0 NA
## group=groupB 109 59 41.8 29.6 58.9
# we produce a Kaplan Meier plot
ggsurvplot(fit, data=clin_df, pval=T, risk.table=T, risk.table.height=0.35)
Mutation Data Analysis - Looking more closely at mutation data
#Gene patient matrix
rownames = unique(mutations$Hugo_Symbol)
colnames = clinical_patient$X.Patient.Identifier[-c(1,2,3,4)]
patient_gene = matrix(0, nrow = length(rownames), ncol = length(colnames))
rownames(patient_gene) = rownames
colnames(patient_gene) = colnames
for(i in colnames){
a = mutations$Hugo_Symbol[which(grepl(i,mutations$Tumor_Sample_Barcode))]
for (j in a){patient_gene[j,i] = 1}
}
#Top 20 mutated gene and their number of occurrences
patient_gene <- cbind(patient_gene, count = rowSums(patient_gene))
patient_gene = patient_gene[order(patient_gene[,373],decreasing=TRUE),]
patient_gene[1:20,373]
## TTN TP53 CTNNB1 MUC16 OBSCN ALB PCLO CSMD3 RYR2 FLG
## 129 112 95 80 54 49 49 48 46 43
## APOB ABCA13 LRP1B XIRP2 USH2A HMCN1 GPR98 CACNA1E FAT3 FUT9
## 40 38 37 36 36 35 35 34 34 33
#Modify rna_seq so it only includes patients with mutation in one of the top 50 mutated gene
PatientID <- c()
genes <-rownames(patient_gene)
for (i in 1:50){
tmp <- mutations$Tumor_Sample_Barcode[which(mutations$Hugo_Symbol == genes[i])]
tmp <- gsub("\\-", ".", tmp)
PatientID <- append(PatientID,tmp)
}
PatientID <- unique(PatientID)
rna_seq_mod <- rna_seq
rna_seq_mod <- rna_seq_mod[ , (substr(names(rna_seq_mod),1,15) %in% PatientID)]
#DEseq on patient groups based on w/t vs w/out mutation in particular genes
countData <- rna_seq
patient <- colnames(countData)
colData <- data.frame(patient)
groupA <- colnames(rna_seq_mod) #patient with mutated gene (top 50)
colData$group <- ifelse(colData$patient %in% groupA,
"groupA",
ifelse(!(colData$patient %in% groupA), "groupB", NA))
rownames(colData) <- colData$patient
#Running the differential expression pipeline
dds = DESeqDataSetFromMatrix(countData=round(countData ),
colData=colData,
design=~group)
dds = DESeq(dds)
#Building the results table
res <- results(dds)
res = results(dds, contrast=c("group", "groupA", "groupB"))
mcols(res, use.names = TRUE)
## DataFrame with 6 rows and 2 columns
## type description
## <character> <character>
## baseMean intermediate mean of normalized c..
## log2FoldChange results log2 fold change (ML..
## lfcSE results standard error: grou..
## stat results Wald statistic: grou..
## pvalue results Wald test p-value: g..
## padj results BH adjusted p-values
summary(res)
##
## out of 55073 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 13038, 24%
## LFC < 0 (down) : 5646, 10%
## outliers [1] : 0, 0%
## low counts [2] : 19223, 35%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
#Variance stabilizing transformation
vsd <- vst(dds)
#PCA
plotPCA(vsd,intgroup=c("group"))
#Significant and differentially expressed genes
resSig <- subset(res, padj < 0.05)
resSig_1 <- subset(resSig, log2FoldChange < -1 | log2FoldChange > 1)
genes <- order(resSig_1$padj,decreasing = TRUE)
#Patient-gene heatmap
annot_col = data.frame(colData$group)
row.names(annot_col) <- rownames(colData)
sampleMatrix <- assay(vsd)[genes,]
rownames(sampleMatrix) = rownames(countData[genes,])
colnames(sampleMatrix) = colnames(countData)
pheatmap(sampleMatrix , cluster_rows=TRUE, show_rownames=FALSE,
show_colnames = FALSE, clustering_method = "ward.D",
cluster_cols=TRUE, annotation_col=annot_col)
#Find the gene names of the top significant and differentially expressed genes
library("AnnotationDbi")
library("org.Hs.eg.db")
ensembl_id = as.vector(rownames(sampleMatrix))
ensembl_id <- substr(ensembl_id, start=1, stop=15)
gene_name <- select(org.Hs.eg.db, keys=ensembl_id,
columns="SYMBOL", keytype="ENSEMBL")
head(gene_name)
#Pathway analysis on the patient cluster DE genes
tmp=gsub("\\..*","",row.names(res))
res$entrez = mapIds(org.Hs.eg.db,
keys=tmp,
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
data(kegg.sets.hs)
data(sigmet.idx.hs)
# Focus on signaling and metabolic pathways only
kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs]
#Look at all significant genes/pathways - padj < 0.06
#We were getting trouble with GAGE when trying to get pathways with padj < 0.05 so we used 0.06 instead
res = subset(res, padj < 0.06)
foldchanges = res$log2FoldChange
names(foldchanges) = res$entrez
foldchanges <- foldchanges[!is.na(names(foldchanges))]
head(foldchanges)
## 55732 2268 3075 2519 2729 4800
## 0.4288297 -0.6497088 -0.3444860 0.1983943 -0.2476456 0.3118879
keggres = gage(foldchanges, gsets=kegg.sets.hs)
## Focus on top upregulated and downregulated pathways
keggrespathwaysup <- rownames(keggres$greater)[1] #change to [1:5] to look at top 5 pathways
keggrespathwaysdown <- rownames(keggres$less)[1]
# Extract the 8 character long IDs part of each string
keggresidsup = substr(keggrespathwaysup, start=1, stop=8)
keggresidsdown = substr(keggrespathwaysdown, start=1, stop=8)
pathview(gene.data=foldchanges, pathway.id=keggresidsup, species="hsa")
pathview(gene.data=foldchanges, pathway.id=keggresidsdown, species="hsa")
Comparing expression with mutation analysis data
Notice that
patient cluster 3 from expression data analysis have some overlaps with
the non-mutated patient cluster (group B) from mutation data
analysis
#PCA plot for expression analysis
plotPCA(vsd1,intgroup=c("cluster"))
#PCA plot for mutation analysis where group B is the non-mutated patient cluster
plotPCA(vsd,intgroup=c("group"))
Chekc to see if survival rates of group B is similar to cluster 3
#Survival Analysis - Patients with mutation in one of the top 50 mutated gene vs without
patient <- substr(colData$patient,1,12)
patient <- gsub("\\.", "-", patient)
groupA <- patient[grep("groupA",colData$group)] #patient with mutated gene (top 50)
groupB <- patient[grep("groupB",colData$group)] #patient without mutated gene
clinical_patient$group <- ifelse(clinical_patient$X.Patient.Identifier %in% groupA,
"groupA",
ifelse(clinical_patient$X.Patient.Identifier %in% groupB, "groupB", NA))
# we are only interested in the "With Tumor" cases for survival
clin_df = clinical_patient[clinical_patient$Person.Neoplasm.Cancer.Status == "With Tumor",
c("X.Patient.Identifier",
"Overall.Survival.Status",
"Overall.Survival..Months.",
"group")]
# create a new boolean variable that has TRUE for dead patients
# and FALSE for live patients
clin_df$deceased = clin_df$Overall.Survival.Status == "1:DECEASED"
# Overall survival months takes into account months_to_death
#for dead patients, and to months_to_last_follow_up for patients who
# are still alive
clin_df$Overall.Survival..Months. <- as.numeric(clin_df$Overall.Survival..Months.)
# fit a survival model
fit = survfit(Surv(Overall.Survival..Months., deceased) ~ group, data=clin_df)
print(fit)
## Call: survfit(formula = Surv(Overall.Survival..Months., deceased) ~
## group, data = clin_df)
##
## 4 observations deleted due to missingness
## n events median 0.95LCL 0.95UCL
## group=groupA 99 56 37.3 26.4 55.7
## group=groupB 10 4 69.6 49.0 NA
# we produce a Kaplan Meier plot
ggsurvplot(fit, data=clin_df, pval=T, risk.table=T, risk.table.height=0.35)
It seems that cluster 3 and Group B may be in some way related, as their
clustering is very similar. However, this does not mean that cluster 3
is group B as shown by the discrepancy in their survival analysis
results.